library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggmosaic)
# set default theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 16))
# set default figure parameters for knitr
knitr::opts_chunk$set(
fig.width = 8,
fig.asp = 0.618,
fig.retina = 2,
dpi = 150,
out.width = "70%"
)
# load data
raw_coffee_data <- read_csv("data/coffee_survey.csv")
## Rows: 4042 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (44): submission_id, age, cups, where_drink, brew, brew_other, purchase,...
## dbl (13): expertise, coffee_a_bitterness, coffee_a_acidity, coffee_a_persona...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(raw_coffee_data)
## Rows: 4,042
## Columns: 57
## $ submission_id <chr> "gMR29l", "BkPN0e", "W5G8jj", "4xWgGr", "…
## $ age <chr> "18-24 years old", "25-34 years old", "25…
## $ cups <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Less tha…
## $ where_drink <chr> NA, NA, NA, NA, NA, NA, "At a cafe, At th…
## $ brew <chr> NA, "Pod/capsule machine (e.g. Keurig/Nes…
## $ brew_other <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ purchase <chr> NA, NA, NA, NA, NA, NA, "National chain (…
## $ purchase_other <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ favorite <chr> "Regular drip coffee", "Iced coffee", "Re…
## $ favorite_specify <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ additions <chr> "No - just black", "Sugar or sweetener, N…
## $ additions_other <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ dairy <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ sweetener <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ style <chr> "Complex", "Light", "Complex", "Complex",…
## $ strength <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ roast_level <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ caffeine <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ expertise <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_a_bitterness <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_a_acidity <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_a_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_a_notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_b_bitterness <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_b_acidity <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_b_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_b_notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_c_bitterness <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_c_acidity <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_c_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_c_notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_d_bitterness <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_d_acidity <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_d_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_d_notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ prefer_abc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ prefer_ad <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ prefer_overall <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wfh <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ total_spend <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ why_drink <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ why_drink_other <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ taste <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ know_source <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ most_paid <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ most_willing <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ value_cafe <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ spent_equipment <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ value_equipment <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ gender <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ gender_specify <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ education_level <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ ethnicity_race <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ ethnicity_race_specify <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ employment_status <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ number_children <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ political_affiliation <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# clean and organize columns
coffee_data_clean <- raw_coffee_data %>%
mutate(across(where(is.character), ~na_if(., "NA"))) %>%
mutate(across(where(is.character), ~na_if(., "")))
coffee_data_clean <- coffee_data_clean %>%
mutate(
total_spend = as.numeric(total_spend)) %>%
mutate(across(c
(political_affiliation,
education_level,
ethnicity_race,
gender,
employment_status,
age),
~factor(.))) %>%
mutate(
brew = str_to_lower(brew),
where_drink = str_to_lower(where_drink)) %>%
mutate(cup_num = case_when(
cups == "Less than 1" ~ 0,
cups == "1" ~ 1,
cups == "2" ~ 2,
cups == "3" ~ 3,
cups == "More than 4" ~ 5,
cups == "4" ~ 4,
TRUE ~ NA_real_
))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `total_spend = as.numeric(total_spend)`.
## Caused by warning:
## ! NAs introduced by coercion
distinct(raw_coffee_data, cups)
## # A tibble: 7 × 1
## cups
## <chr>
## 1 <NA>
## 2 Less than 1
## 3 2
## 4 1
## 5 3
## 6 More than 4
## 7 4
# remove predominately NA columns
prop_missing <- sapply(coffee_data_clean, function(x) mean(is.na(x)))
print(prop_missing)
## submission_id age
## 0.000000000 0.007669471
## cups where_drink
## 0.023008412 0.017318159
## brew brew_other
## 0.095249876 0.832261257
## purchase purchase_other
## 0.824344384 0.992330529
## favorite favorite_specify
## 0.015338941 0.971301336
## additions additions_other
## 0.020534389 0.988124691
## dairy sweetener
## 0.582879762 0.873330035
## style strength
## 0.020781791 0.031172687
## roast_level caffeine
## 0.025235032 0.030925285
## expertise coffee_a_bitterness
## 0.025729837 0.060366155
## coffee_a_acidity coffee_a_personal_preference
## 0.065066799 0.062592776
## coffee_a_notes coffee_b_bitterness
## 0.362196932 0.064819396
## coffee_b_acidity coffee_b_personal_preference
## 0.068035626 0.066551212
## coffee_b_notes coffee_c_bitterness
## 0.392380010 0.068777833
## coffee_c_acidity coffee_c_personal_preference
## 0.071994062 0.068283028
## coffee_c_notes coffee_d_bitterness
## 0.410440376 0.068035626
## coffee_d_acidity coffee_d_personal_preference
## 0.068530430 0.068777833
## coffee_d_notes prefer_abc
## 0.359722909 0.066798615
## prefer_ad prefer_overall
## 0.069520040 0.067293419
## wfh total_spend
## 0.128154379 1.000000000
## why_drink why_drink_other
## 0.117268679 0.958683820
## taste know_source
## 0.118505690 0.119495299
## most_paid most_willing
## 0.127412172 0.131618011
## value_cafe spent_equipment
## 0.134092034 0.132607620
## value_equipment gender
## 0.135576447 0.128401781
## gender_specify education_level
## 0.997031173 0.149430975
## ethnicity_race ethnicity_race_specify
## 0.154379020 0.974022761
## employment_status number_children
## 0.154131618 0.157347848
## political_affiliation cup_num
## 0.186293914 0.023008412
coffee_data_clean <- coffee_data_clean %>%
select(where(~mean(is.na(.)) < 0.9))
# check dataset
str(coffee_data_clean)
## tibble [4,042 × 51] (S3: tbl_df/tbl/data.frame)
## $ submission_id : chr [1:4042] "gMR29l" "BkPN0e" "W5G8jj" "4xWgGr" ...
## $ age : Factor w/ 7 levels "<18 years old",..: 3 4 4 5 4 7 3 NA 4 NA ...
## $ cups : chr [1:4042] NA NA NA NA ...
## $ where_drink : chr [1:4042] NA NA NA NA ...
## $ brew : chr [1:4042] NA "pod/capsule machine (e.g. keurig/nespresso)" "bean-to-cup machine" "coffee brewing machine (e.g. mr. coffee)" ...
## $ brew_other : chr [1:4042] NA NA NA NA ...
## $ purchase : chr [1:4042] NA NA NA NA ...
## $ favorite : chr [1:4042] "Regular drip coffee" "Iced coffee" "Regular drip coffee" "Iced coffee" ...
## $ additions : chr [1:4042] "No - just black" "Sugar or sweetener, No - just black" "No - just black" "No - just black, Cinnamon" ...
## $ dairy : chr [1:4042] NA NA NA NA ...
## $ sweetener : chr [1:4042] NA NA NA NA ...
## $ style : chr [1:4042] "Complex" "Light" "Complex" "Complex" ...
## $ strength : chr [1:4042] NA NA NA NA ...
## $ roast_level : chr [1:4042] NA NA NA NA ...
## $ caffeine : chr [1:4042] NA NA NA NA ...
## $ expertise : num [1:4042] NA NA NA NA NA NA NA NA NA NA ...
## $ coffee_a_bitterness : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_a_acidity : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_a_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_a_notes : chr [1:4042] NA NA NA NA ...
## $ coffee_b_bitterness : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_b_acidity : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_b_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_b_notes : chr [1:4042] NA NA NA NA ...
## $ coffee_c_bitterness : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_c_acidity : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_c_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_c_notes : chr [1:4042] NA NA NA NA ...
## $ coffee_d_bitterness : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_d_acidity : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_d_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
## $ coffee_d_notes : chr [1:4042] NA NA NA NA ...
## $ prefer_abc : chr [1:4042] NA NA NA NA ...
## $ prefer_ad : chr [1:4042] NA NA NA NA ...
## $ prefer_overall : chr [1:4042] NA NA NA NA ...
## $ wfh : chr [1:4042] NA NA NA NA ...
## $ why_drink : chr [1:4042] NA NA NA NA ...
## $ taste : chr [1:4042] NA NA NA NA ...
## $ know_source : chr [1:4042] NA NA NA NA ...
## $ most_paid : chr [1:4042] NA NA NA NA ...
## $ most_willing : chr [1:4042] NA NA NA NA ...
## $ value_cafe : chr [1:4042] NA NA NA NA ...
## $ spent_equipment : chr [1:4042] NA NA NA NA ...
## $ value_equipment : chr [1:4042] NA NA NA NA ...
## $ gender : Factor w/ 5 levels "Female","Male",..: NA NA NA NA NA NA NA NA NA NA ...
## $ education_level : Factor w/ 6 levels "Bachelor's degree",..: NA NA NA NA NA NA NA NA NA NA ...
## $ ethnicity_race : Factor w/ 6 levels "Asian/Pacific Islander",..: NA NA NA NA NA NA NA NA NA NA ...
## $ employment_status : Factor w/ 6 levels "Employed full-time",..: NA NA NA NA NA NA NA NA NA NA ...
## $ number_children : chr [1:4042] NA NA NA NA ...
## $ political_affiliation : Factor w/ 4 levels "Democrat","Independent",..: NA NA NA NA NA NA NA NA NA NA ...
## $ cup_num : num [1:4042] NA NA NA NA NA NA NA NA 0 NA ...
head(coffee_data_clean)
## # A tibble: 6 × 51
## submission_id age cups where_drink brew brew_other purchase favorite
## <chr> <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 gMR29l 18-24 year… <NA> <NA> <NA> <NA> <NA> Regular…
## 2 BkPN0e 25-34 year… <NA> <NA> pod/… <NA> <NA> Iced co…
## 3 W5G8jj 25-34 year… <NA> <NA> bean… <NA> <NA> Regular…
## 4 4xWgGr 35-44 year… <NA> <NA> coff… <NA> <NA> Iced co…
## 5 QD27Q8 25-34 year… <NA> <NA> pour… <NA> <NA> Latte
## 6 V0LPeM 55-64 year… <NA> <NA> pod/… <NA> <NA> Iced co…
## # ℹ 43 more variables: additions <chr>, dairy <chr>, sweetener <chr>,
## # style <chr>, strength <chr>, roast_level <chr>, caffeine <chr>,
## # expertise <dbl>, coffee_a_bitterness <dbl>, coffee_a_acidity <dbl>,
## # coffee_a_personal_preference <dbl>, coffee_a_notes <chr>,
## # coffee_b_bitterness <dbl>, coffee_b_acidity <dbl>,
## # coffee_b_personal_preference <dbl>, coffee_b_notes <chr>,
## # coffee_c_bitterness <dbl>, coffee_c_acidity <dbl>, …
This data set comes from World Champion Barista James Hoffmann and coffee company Cometeer, who in 2023, collected survey responses corresponding to around 5000 taste testing kits distributed across the country about coffee preferences. I downloaded the data from Kaggle.
Are there significant differences in economic coffee consumption patterns between individuals of different political affiliations within the same education or ethnic group?
cups_poli_edu <- ggplot(coffee_data_clean, aes(
x = political_affiliation,
y = cup_num,
fill = political_affiliation)) +
geom_boxplot() +
facet_wrap(~ education_level) +
labs(title = "Coffee Cups by Political Affiliation and Education Level",
x = "Political Affiliation",
y = "Cups of Coffee per Day",
fill = "Political Affiliation") +
theme_minimal()
ggplotly(cups_poli_edu)
## Warning: Removed 93 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
To create the above graph, I converted the cups variable from a character variable to a numeric by taking the median value of each subset of the data. I then chose to plot it in a box plot faceted across educational attainment and used color to show the distributions of coffee consumption daily by political affiliation. I think a lot can still be done to work on this graph, including improving color choice and reevaluating the labels on the x-axis. I think overall spacing is a struggle at the moment due to the wordy category types that I will need to address moving forward.
convert_most_paid <- function(value) {
case_when(
value == "Less than $2" ~ 1,
value == "More than $20" ~ 22,
TRUE ~ {
nums <- as.numeric(unlist(
str_extract_all(value, "[0-9]+")))
if (length(nums) == 2) {
mean(nums)
} else {
NA_real_
}
}
)
}
coffee_data_clean <- coffee_data_clean %>%
mutate(most_paid_numeric = sapply(
most_paid, convert_most_paid))
ggplot(coffee_data_clean, aes(
x = political_affiliation,
y = most_paid_numeric,
color = education_level)) +
geom_jitter(width = 0.2, alpha = 0.5) +
facet_wrap(~ ethnicity_race)
## Warning: Removed 515 rows containing missing values or values outside the scale range
## (`geom_point()`).
labs(title = "Cafe Value for Money by Political Affiliation and Ethnicity",
x = "Political Affiliation",
y = "Most Paid for Cup of Coffee",
fill = "Education Level")
## $x
## [1] "Political Affiliation"
##
## $y
## [1] "Most Paid for Cup of Coffee"
##
## $fill
## [1] "Education Level"
##
## $title
## [1] "Cafe Value for Money by Political Affiliation and Ethnicity"
##
## attr(,"class")
## [1] "labels"
distinct(coffee_data_clean, most_paid)
## # A tibble: 9 × 1
## most_paid
## <chr>
## 1 <NA>
## 2 $4-$6
## 3 $2-$4
## 4 $10-$15
## 5 $6-$8
## 6 $8-$10
## 7 More than $20
## 8 $15-$20
## 9 Less than $2
distinct(coffee_data_clean, most_willing)
## # A tibble: 9 × 1
## most_willing
## <chr>
## 1 <NA>
## 2 $8-$10
## 3 More than $20
## 4 $15-$20
## 5 $4-$6
## 6 $6-$8
## 7 $10-$15
## 8 $2-$4
## 9 Less than $2
ggplot(coffee_data_clean, aes(
x = most_willing,
fill = education_level)) +
geom_bar(position = "dodge") +
facet_wrap(~ political_affiliation) +
labs(
title = "Most Willing to Pay for a Cup of Coffee by Political Affiliation",
x = "Most Willing Price Range",
y = "Count",
fill = "Education Level"
)
I think in regards to the draft graphs 2 and 3, I need to figure out better spacing or text size for the x-axis as the current format is very difficult to read. I like the idea of faceting these graphs across political affiliation or ethnic group, but I think when I consider their placement into a dashboard, I will need to think more critically about having multiple faceted visualizations and potentially look to visualize in such a way that does not need to be faceted or accounts for the interaction of the user as a means of separating out the interactions expressed by faceting above. Further graphs will definetly explore other economic related variables, as I am really interested in pulling out trends related to political affiliation, ethnicity, education and the idea of value or willingness to pay for coffee products.
Along with these graphs, I would like to create a Shiny Dashboard that would allow a viewer to select specific education levels or ethnic groups that would update visualizations to be able to further explore the visualizations above. I explored this type of dashboard in Tableau, and I really enjoyed the interactive element it incorporated into the dashboard so I would like to incorporate something similar into my final project through Shiny. Ideally, there will be 4-5 different visualizations and potentially a chart on the interactive dashboard.